An effective data visualization:
A great visualization has some additional properties:
data <- tibble( x=rnorm(100), y=rnorm(100,0,5) ) data %>% ggplot(aes(x=x,y=y)) + geom_point()
data <- mutate(data, `x times y`=abs(x*y) ) data %>% ggplot(aes(x=x,y=y,size=`x times y`)) + geom_point()
data <- mutate(data,
z=runif(100),
category=sample(c('A','B','C'), 100, replace=TRUE)
)
data %>%
ggplot(aes(x=x, y=y, size=`x times y`, color=z, shape=category)) +
geom_point()
data <- mutate(data, `x + y`=x+y) %>%
arrange(`x + y`) %>%
mutate(
xend=lag(x,1),
yend=lag(y,1)
)
data %>%
ggplot() +
geom_segment(aes(x=x,xend=xend,y=y,yend=yend,alpha=0.5)) +
geom_point(aes(x=x, y=y, size=`x times y`, color=z, shape=category))
tibble( value=rnorm(100), category='A' ) %>% ggplot(aes(x=category,y=value,fill=category)) + geom_violin()
| Plot | \(x\) encoding | \(y\) encoding | note |
|---|---|---|---|
| scatter | position | position | |
| vertical bar | position | height | |
| horizontal bar | width | position | |
| lollipop | position | height for line + position for “head” | |
| violin | position | width | \(x\) transformed to range, \(y\) transformed to densities |
# two groups of samples with similar random data profiles
data <- as.matrix(
tibble(
A=c(rnorm(10),rnorm(10,2)),
B=c(rnorm(10),rnorm(10,2)),
C=c(rnorm(10),rnorm(10,2)),
D=c(rnorm(10,4),rnorm(10,-1)),
E=c(rnorm(10,4),rnorm(10,-1)),
F=c(rnorm(10,4),rnorm(10,-1))
)
)
rownames(data) <- paste0('G',seq(nrow(data)))
heatmap(data)
## Registered S3 method overwritten by 'GGally': ## method from ## +.gg ggplot2
library(patchwork)
data <- tibble(
percent=c(86,88,87,90,93,89),
ID=c('A','B','C','D','E','F')
)
g <- ggplot(data, aes(x=ID,y=percent)) +
geom_bar(stat="identity")
g + coord_cartesian(ylim=c(85,95)) | g
ggplot2ggplot2 Fundamentalsggplot2 ExampleA simple example dataset:
## # A tibble: 20 × 8 ## ID age_at_death condition tau abeta iba1 gfap braak_stage ## <chr> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 A1 73 AD 96548 176324 157501 78139 4 ## 2 A2 82 AD 95251 0 147637 79348 4 ## ... ## 10 A10 69 AD 48357 27260 47024 78507 2 ## 11 C1 80 Control 62684 93739 131595 124075 3 ## 12 C2 77 Control 63598 69838 7189 35597 3 ## ... ## 20 C10 73 Control 15781 16592 10271 100858 1
ggplot2 Exampleage_at_deathtauggplot(
data=ad_metadata,
mapping = aes(
x = age_at_death,
y = tau
)
) +
geom_point()ggplot(data=ad_metadata, mapping=aes(x=age_at_death, y=tau)) + geom_point()
ggplot2 Plot Componentsggplot(data=ad_metadata, mapping=aes(x=age_at_death, y=tau)) + geom_point()
ggplot() - function creates a plotdata= - pass a tibble with the datamapping=aes(...) - Define an aesthetics mapping connecting data to plot propertiesgeom_point(...) - Specify geometry as points where marks will be made at pairs of x,y coordinatesIs this the whole story?
There are both AD and Control subjects in this dataset!
How does condition relate to this relationship we see?
Layer on an additional aesthetic of color:
ggplot(
data=ad_metadata,
mapping = aes(
x = age_at_death,
y = tau,
color=condition # color each point
)
) +
geom_point()
ggplot(data=ad_metadata, mapping=aes(
x=age_at_death, y=tau, color=condition
)) + geom_point()
Differences in distributions of variables can be important
Examine the distribution of age_at_death for AD and Control samples with violin geometry with geom_violin():
ggplot(data=ad_metadata, mapping = aes(x=condition, y=age_at_death)) + geom_violin()
ggplot(data=ad_metadata, mapping = aes(x=condition, y=age_at_death)) + geom_violin()
patchwork library:library(patchwork)
age_boxplot <- ggplot(
data=ad_metadata,
mapping = aes(x=condition, y=age_at_death)
) +
geom_boxplot()
tau_boxplot <- ggplot(
data=ad_metadata,
mapping=aes(x=condition, y=tau)
) +
geom_boxplot()
age_boxplot | tau_boxplot # this puts the plots side by side
age_boxplot | tau_boxplot # this puts the plots side by side
ggplot Mechanicsggplot Mechanicsggplot has two key concepts that give it great flexibility: layers and scalesggplot LayersEach layer is a set of data connected to a geometry and an aesthetic
Each geom_X() function adds a layer to a plot
The plot has three layers:
ggplot(data=ad_metadata, mapping=aes(x=age_at_death)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=abeta, color='red')) + geom_point(mapping=aes(y=iba1, color='cyan'))
ggplot Layersggplot Scalesggplot ScalesHow many layers? How many scales?
ggplot Incompatible Scalesggplot(data=ad_metadata, mapping=aes(x=braak_stage)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=age_at_death, color='red'))
ggplot Incompatible Scalesggplot(ad_metadata,
mapping = aes(
x=ID,
y=tau)
) +
geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau)) + geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau,fill=condition)) + geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau,fill=condition)) + geom_bar(stat="identity")
mutate(ad_metadata, tau_centered=(tau - mean(tau))) %>% ggplot(mapping = aes(x=ID, y=tau_centered, fill=condition)) + geom_bar(stat="identity")
geom_point and geom_segment layers:ggplot(ad_metadata) + geom_point(mapping=aes(x=ID, y=tau)) + geom_segment(mapping=aes(x=ID, xend=ID, y=0, yend=tau))
pivot_longer(
ad_metadata,
c(tau,abeta,iba1,gfap),
names_to='Marker',
values_to='Intensity'
) %>%
ggplot(aes(x=ID,y=Intensity,group=Marker,fill=Marker)) +
geom_area()
Stacked area plots require three pieces of data:
pivot_longer(
ad_metadata,
c(tau,abeta,iba1,gfap),
names_to='Marker',
values_to='Intensity'
) %>%
group_by(ID) %>% # divide each intensity values by sum markers
mutate(
`Relative Intensity`=Intensity/sum(Intensity)
) %>%
ungroup() %>% # ungroup restores the tibble to original number of rows
ggplot(aes(x=ID,y=`Relative Intensity`,group=Marker,fill=Marker)) +
geom_area()
ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death),bins=10)
tibble(x=rnorm(1000)) %>% ggplot() + geom_histogram(aes(x=x))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
tibble(
x=c(rnorm(1000),rnorm(1000,mean=4)),
type=c(rep('A',1000),rep('B',1000))
) %>%
ggplot(aes(x=x,fill=type)) +
geom_histogram(bins=30, alpha=0.6, position="identity")
Similar to histogram, except instead of binning the values draws a smoothly interpolated line that approximates the distribution
Density plot is always normalized so the integral under the curve is approximately 1
ggplot(ad_metadata) + geom_density(mapping=aes(x=age_at_death),fill="#c9a13daa")
library(patchwork) hist_g <- ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death),bins=30) density_g <- ggplot(ad_metadata) + geom_density(mapping=aes(x=age_at_death),fill="#c9a13daa") hist_g | density_g
library(patchwork)
normal_samples <- tibble(
x=c(rnorm(1000),rnorm(1000,mean=4)),
type=c(rep('A',1000),rep('B',1000))
)
hist_g <- ggplot(normal_samples) +
geom_histogram(
mapping=aes(x=x,fill=type),
alpha=0.6, position="identity", bins=30
)
density_g <- ggplot(normal_samples) +
geom_density(
mapping=aes(x=x,fill=type),
alpha=0.6, position="identity"
)
hist_g | density_g
ggplot(ad_metadata) + geom_boxplot(mapping=aes(x=condition,y=age_at_death))
ggplot(ad_metadata) + geom_boxplot(mapping=aes(x=condition,y=age_at_death))
Boxplot anatomy. IQR stands for “inner quartile range”, the distance between the 1st and 3rd quartile - Wikipedia
normal_samples <- tibble(
x=c(rnorm(1000),rnorm(1000,4),rnorm(1000,2,3)),
type=c(rep('A',2000),rep('B',1000))
)
ggplot(normal_samples, aes(x=x,fill=type,alpha=0.6)) +
geom_density()
ggplot(normal_samples, aes(x=type,y=x,fill=type)) + geom_boxplot()
library(patchwork) g <- ggplot(normal_samples, aes(x=type,y=x,fill=type)) boxplot_g <- g + geom_boxplot() violin_g <- g + geom_violin() boxplot_g | violin_g
ggplot(ad_metadata) + geom_violin(aes(x=condition,y=tau,fill=condition))
library(ggbeeswarm)
ggplot(ad_metadata) +
geom_beeswarm(
aes(x=condition,y=age_at_death,color=condition),
cex=2,
size=2
)
normal_samples <- tibble(
x=c(rnorm(1000),rnorm(1000,4),rnorm(1000,2,3)),
type=c(rep('A',2000),rep('B',1000))
)
ggplot(normal_samples, aes(x=type,y=x,color=type)) +
geom_beeswarm()
ggplot(normal_samples, aes(x=type,y=x,color=type)) + geom_beeswarm()
normal_samples <- tibble(
x=c(rnorm(100),rnorm(100,4),rnorm(100,2,3)),
type=c(rep('A',200),rep('B',100)),
category=sample(c('healthy','disease'),300,replace=TRUE)
)
ggplot(normal_samples, aes(x=type,y=x,color=category)) +
geom_beeswarm()
library(ggridges)
tibble(
x=c(rnorm(100),rnorm(100,4),rnorm(100,2,3)),
type=c(rep('A',200),rep('B',100)),
) %>%
ggplot(aes(y=type,x=x,fill=type)) +
geom_density_ridges()
## Picking joint bandwidth of 0.822
tibble(
x=rnorm(10000,mean=runif(10,1,10),sd=runif(2,1,4)),
type=rep(c("A","B","C","D","E","F","G","H","I","J"),1000)
) %>%
ggplot(aes(y=type,x=x,fill=type)) +
geom_density_ridges(alpha=0.6,position="identity")
## Picking joint bandwidth of 0.494
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_point(size=3)
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_point(size=3)
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, shape=condition)) + geom_point(size=3)
g <- ggplot()
for(x in 0:5) {
for(y in 0:4) {
if(x+y*6 < 26) {
g <- g + geom_point(
tibble(x=x,y=y),
aes(x=x,y=y),
shape=x+y*6,size=8) +
geom_label(
tibble(x=x,y=y,label=x+y*6),
aes(x=x,y=y+0.5,label=label)
)
}
}
}
g
scale_shape_manual():ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, shape=condition)) + geom_point(size=3) + scale_shape_manual(values=c(3,9))
library(patchwork) g <- ggplot(ad_metadata) g_condition <- g + geom_point(mapping=aes(x=abeta, y=tau, color=condition),size=3) g_age <- g + geom_point(mapping=aes(x=abeta, y=tau, color=age_at_death),size=3) g_condition / g_age
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, size=age_at_death)) + geom_point(alpha=0.5)
arrange(ad_metadata,age_at_death) %>%
mutate(
x=abeta,
xend=lag(x,1),
y=tau,
yend=lag(y,1)
) %>%
ggplot() +
geom_segment(aes(x=abeta, xend=xend, y=tau, yend=yend)) +
geom_point(aes(x=x,y=y,shape=condition,color=condition),size=3)
geom_line() function draws lines between pairs of points sorted by \(x\) axis by defaultggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) +
geom_line()
group aesthetic mappingpivot_longer(ad_metadata,
c(tau,abeta,iba1,gfap),
names_to='Marker',
values_to='Intensity'
) %>%
ggplot(ad_metadata,mapping=aes(x=ID, y=Intensity, group=Marker, color=Marker)) +
geom_line()
group aesthetic mappinglibrary(GGally)
ggparcoord(ad_metadata,
columns=c(2,4:8),
groupColumn=3,
showPoints=TRUE
) +
scale_color_manual(values=c("#bbbbbb", "#666666"))
0 - f, 0=0 and f=16#rrggbb
rr is the value of the color redgg for greenbb for blue#ffffff is the color
white#000000 is the color
black#ff0000 is the color
red#0000ff is the color
blue#7fff00 is
this color (called chartreuse)heatmap() function:heatmap() function creates a clustered heatmap where the rows and columns have been hierarchically clustered# heatmap() requires a R matrix, and cannot accept a tibble or a dataframe marker_matrix <- as.matrix( dplyr::select(ad_metadata,c(tau,abeta,iba1,gfap)) ) # rownames of the matrix become y labels rownames(marker_matrix) <- ad_metadata$ID heatmap(marker_matrix)
# heatmap() requires a R matrix, and cannot accept a tibble or a dataframe marker_matrix <- as.matrix( dplyr::select(ad_metadata,c(tau,abeta,iba1,gfap)) ) # rownames of the matrix become y labels rownames(marker_matrix) <- ad_metadata$ID heatmap(marker_matrix)
heatmap() Functionalityheatmap( marker_matrix, Rowv=NA, # don't cluster rows Colv=NA, # don't cluster columns scale="none", # don't scale rows )
heatmap() Drawbackheatmap() function has the major drawback that no color key is provided!heatmap.2() in gplots package has a similar interfacelibrary(gplots) heatmap.2(marker_matrix)
heatmap.2() Example## ## Attaching package: 'gplots'
## The following object is masked from 'package:IRanges': ## ## space
## The following object is masked from 'package:S4Vectors': ## ## space
## The following object is masked from 'package:stats': ## ## lowess
The extra decoration (dashed and solid vertical lines called traces by the package) provide another way to understand the magnitude of the value in each cell. Refer to the gplots documentation of heatmap2 for more information on how to interpret the trace (hint: you may turn it off with by passing the argument trace = "none" to the function call).
Both heatmap() and heatmap.2() provide a useful method to annotate rows and columns with a categorical variable. Each of our subjects either has AD or is a control, and we can add a color bar along the margin of the plot to indicate the status of each subject:
condition_colors <-
transmute(
ad_metadata,
color=if_else(condition == "AD","red","blue")
)
heatmap(
marker_matrix,
RowSideColors=condition_colors$color
)
And with heatmap.2:
heatmap.2( marker_matrix, RowSideColors=condition_colors$color )
Heatmaps may also be created using ggplot and the geom_tile geometry. This method of generating heatmaps is more “manual” than the other functions covered so far, and so are both flexible and require more work to obtain certain functionality (e.g. clustering, color bars on side margins, etc). This geometry requires the data to be in long format, with one column for x, y, and z values:
pivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to="Marker", values_to="Intensity" ) %>% ggplot(aes(x=Marker,y=ID,fill=Intensity)) + geom_tile()
Note this ggplot method does not scale or reorder rows or columns.
The colors of the heatmap may be changed by passing one of the native color palettes to the heatmap function with the col argument:
# native R colors are: # - rainbow(n, start=.7, end=.1) # - heat.colors(n) # - terrain.colors(n) # - topo.colors(n) # - cm.colors(n) # the n argument specifies the number of colors (i.e. resolution) of the colormap to return heatmap(marker_matrix,col=cm.colors(256))
To change the color using ggplot and geom_tile(), use the scale_fill_gradientn function to specify a different color palette:
pivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to="Marker", values_to="Intensity" ) %>% ggplot(aes(x=Marker,y=ID,fill=Intensity)) + geom_tile() + scale_fill_gradientn(colors=cm.colors(256))
The native color palettes in R look as follows:
Instead of these color palettes, you may use one of the ColorBrewer palettes in the RColorBrewer package
library(RColorBrewer) display.brewer.all()
There are very many color palettes to choose from, and you may also create your own palettes. Creating color palettes is somewhat complicated in base R, but ggplot() provides convenient functions scale_fill_gradient() and scale_fill_gradient2() to produce gradient scales with 2 or 3 colors, respectively. More discussion on how to choose colors is described in the next section.
While heatmaps may seem intuitive, they are actually quite complicated and can be difficult to create in a way that accurately depicts the underlying matrix. As described in the [Human Visual Perception] section, our visual system estimates the hue and value of colors based on adjacent colors. For large heatmaps with many rows and columns, it is therefore very difficult to accurately estimate relationships across wide sections of the heatmap. This may not always be problematic, as heatmaps might only be needed to convey general patterns in a dataset. However, the designer has a large degree of control over how the heatmap utilizes hue and value when mapping to the underlying numbers, and those choices have a large impact on the interpretation of the chart. Before data can be effectively visualized in a heatmap, some preprocessing is usually required to yield a useful plot.
There are four major factors that influence how a dataset can be visualized as a heatmap:
There are two broad types of features: discrete and continuous. The data elements of a discrete feature take on one of a finite set of values. For example, a discrete feature might include values like 0 and 1, Case and Control or only the integers between 1 and 6. In general, we are primarily interested to know whether samples with the same discrete value group together, so ideally all colors that are mapped to discrete features will be easily distinguishable from one another. As the number of possible values the feature may take increases, finding a set of colors that are all distinguishable from one another becomes increasingly difficult. Consider the following three heatmaps that show different discrete data:
library(patchwork)
random_letters <- function(n) {
letters <- c('A','B','C','D','E','F','G','H','I','J')
sample(letters[1:n],10,replace=TRUE)
}
features <- paste0('F',seq(10))
g_binary <- tibble(
feature=features,
a=random_letters(2),
b=random_letters(2),
c=random_letters(2)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=feature,fill=value)) + geom_tile()
g_trinary <- tibble(
feature=paste0('F',seq(10)),
a=random_letters(3),
b=random_letters(3),
c=random_letters(3)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=feature,fill=value)) + geom_tile()
g_unmanageable <- tibble(
feature=paste0('F',seq(10)),
a=random_letters(10),
b=random_letters(10),
c=random_letters(10)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=feature,fill=value)) + geom_tile()
g_binary | g_trinary | g_unmanageable
The first and second heatmaps have 2 and 3 discrete values each, and the colors are easy to tell apart. The third heatmap has 10 discrete values and it is very difficult to distinguish between colors in different parts of the plot.
Continuous features can take any of a (possibly infinite) range of real values, e.g. \(-3.252\), \(858,352\), or \(0\). These values will be mapped to colors of a gradient, where two or more colors are blended together linearly from one color to another, and values are mapped within this range of colors in proportion to their relative magnitudes. Two gradients, one with two colors and one with three, are shown below:
g_two_up <- tibble( ID='a', x=seq(0,1,0.1), val=x ) %>% ggplot(aes(x=x,y=ID,fill=val)) + geom_tile() + scale_fill_gradient(low="#990000", high="#ffdddd") g_two_down <- tibble( ID='a', x=seq(0,1,0.1), val=x ) %>% ggplot(aes(x=x,y=ID,fill=val)) + geom_tile() + scale_fill_gradient(low="#ffdddd", high="#990000") g_three <- tibble( ID='a', x=seq(0,1,0.1), val=x ) %>% ggplot(aes(x=x,y=ID,fill=val)) + geom_tile() + scale_fill_gradient2(low="#990000", mid="#ffffff", high="#9999ff", midpoint=0.5) g_two_up / g_two_down / g_three
These three example gradients comprise the most common types of color mappings. The first two with one color differ in which color (i.e. lighter or darker) is mapped to the largest value. The third gradient is called a diverging palette, because the maximum and minimum values show divergence from some central value, which in this case is 0.5. For diverging color palettes, the value in the middle of the gradient is often very important, as it defines which values are visualized in which color. More on diverging color palettes is in the following sections.
In general, the types of features, i.e. discrete or continuous, must all be the same for a heatmap to be meaningful. Once this has been determined, the values of each feature must be examined and possibly transformed, as described next.
The second important consideration when plotting a heatmap is to understand the nature of the different features and how they relate to one another. Specifically, the features must all be on a comparable scale, or transformed appropriately to attain this, for the colors in the heatmap to be visible. Consider the following heatmap with three features plotted as columns and 10 samples as rows:
data <- tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,0,1),
b=rnorm(10,100,20),
c=rnorm(10,20,5)
) %>%
pivot_longer(c(a,b,c))
ggplot(data,aes(x=name,y=ID,fill=value)) +
geom_tile()
The scales of the different features \(a\), \(b\), and \(c\) are different, namely they have means of 0, 100, and 20, respectively. This an be clearly seen when the data are plotted as a beeswarm plot:
library(ggbeeswarm) ggplot(data) + geom_beeswarm(aes(x=name,y=value,color=name))
As the data are on dramatically different scales, only the feature with the largest spread (b) can be distinguished in the heatmap.
In order to visualize all these data meaningfully in the same plot, the features must be scaled so they are all comparable. Typically this is done by standardizing the features to have a mean of zero and a standard deviation of 1:
data %>%
pivot_wider(id_cols='ID',names_from=name) %>%
mutate(
across(c(a,b,c),scale)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
Now each feature has a mean of zero and standard deviation of one:
data %>%
pivot_wider(id_cols='ID',names_from=name) %>%
mutate(
across(c(a,b,c),scale)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot() +
geom_beeswarm(aes(x=name,y=value,color=name))
Now that the data are all on the same scale, each is mapped across the full range of the color palette. In this case, we explicitly scaled each feature, but it is possible to scale by each sample as well:
data %>%
pivot_wider(id_cols=name,names_from=ID) %>%
mutate(
across(starts_with('F'),scale)
) %>%
pivot_longer(starts_with('F'),names_to="ID") %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
This result is dramatically different, and possibly not what you intended. In this case the effect is somewhat obvious - all the values in b are the largest, the values in c are the smallest, and c is in between. However, in general this may not be the case, and care should be taken to understand which dimension of the data has been scaled.
Because features are typically not assumed to be on the same scale, the base R heatmap() and heatmap2() functions automatically scale the rows of the data prior to visualization.
As we saw in the previous section on scaling features, the color mapping is highly sensitive to the range of all values in the data. Even if the features or samples have been properly scaled, extreme values in any of the features can render a heatmap uninformative. Consider the following heatmap that visualizes data that have a single extreme outlier:
tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,0,1),
b=rnorm(10,0,1),
c=c(rnorm(9,0,1),1e9)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
The single extreme value compressed all the other color values to the colors on the lowest end of the color palette. This anomalous value must be removed before differences between the other variables will be visible.
The issue above is caused by the fact that the default color gradient scales assume there is a linear mapping between the values and the palette. This is equivalent to assuming the data are relatively well distributed across a linear range of values, as in a normal or uniform distribution. When the data do not follow this assumption, any resulting heatmaps may suffer the same issue as when there are extreme values. Consider the following heatmap with data randomly sampled from an exponential distribution:
data <- tibble(
ID=paste0('F',seq(10)),
a=10**rnorm(10,4,1),
b=10**rnorm(10,4,1),
c=10**rnorm(10,4,1)
) %>%
pivot_longer(c(a,b,c))
data %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
The data are similarly compressed toward the lower end of the scale due to the uneven distribution of data. In this case, we might consider taking a log transformation of the data prior to drawing the heatmap:
mutate(data,
value=log10(value)
) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
On a log scale, the heatmap visualizes the range of values linearly across the gradient. In this case, we knew a log transform was appropriate to cast the data onto an approximately linear scale, though in general this will need to be explored for each dataset.
The final important consideration when creating a heatmap with continuous values is determining if there is a meaningful center for each feature. For certain kinds of data, data points above and below a certain value have a particular meaning. For example, log2 fold change values from differential expression greater or less than zero indicate a gene is increased or decreased, respectively, in the comparison of interest. When visualizing these values as a heatmap, it is therefore important to indicate this central value with a color, usually white or black, appropriately, so that colors above and below this value can be set accordingly, e.g. above zero is red, below zero blue.
To do this, first a 3-color diverging palette is required, i.e. a palette with one color on one side, a different color on the other side, and a third color in the middle, usually white or black. The following is a heatmap of normally distributed values with mean 0 and standard deviation of 1 visualized on a diverging color scale:
tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,0,1),
b=rnorm(10,0,1),
c=rnorm(10,0,1)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile() +
scale_fill_gradient2(low="#000099", mid="#ffffff", high="#990000", midpoint=0)
Note the scale_fill_gradient2() call, with midpoint=0 argument. This argument instructs the gradient to set the value of 0 to the middle color, which is white. The midpoint of zero is default, but if your data are centered around some other value, failing to specify this midpoint will produce misleading visualizations:
tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,1,1),
b=rnorm(10,1,1),
c=rnorm(10,1,1)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile() +
scale_fill_gradient2(low="#000099", mid="#ffffff", high="#990000")
Here our data are centered with a mean of 1, but the midpoint is still at its default of zero. If we intend for 1 to be the midpoint corresponding to the color white, we must specify it explicitly:
tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,1,1),
b=rnorm(10,1,1),
c=rnorm(10,1,1)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile() +
scale_fill_gradient2(low="#000099", mid="#ffffff", high="#990000", midpoint=1)
If your features do not have a meaningful center, then the color palette scale should generally have only two colors that range from either light to dark or dark to light:
data <- tibble(
ID=paste0('F',seq(10)),
a=runif(10,0,10),
b=runif(10,0,10),
c=runif(10,0,10)
) %>%
pivot_longer(c(a,b,c))
g_up <- ggplot(data, aes(x=name,y=ID,fill=value)) +
geom_tile() +
scale_fill_gradient(low="#000099",high="#eeeeff")
g_dn <- ggplot(data, aes(x=name,y=ID,fill=value)) +
geom_tile() +
scale_fill_gradient(low="#ffffff",high="#000099")
g_up | g_dn
The direction of the color palette depends on the meaning of the data, and is a subjective choice.